output_results = "../results/"
output_data = "../results/"
ps_deseq <- readRDS(paste0(output_results, "Filtered/ps_deseq_friedfilt.rds"))
ps_css <- readRDS(paste0(output_results,"Filtered/ps_css_friedfilt.rds"))
ps_no_norm <- readRDS(paste0(output_results,"Filtered/ps_no_norm_friedfilt.rds"))
ps_not_norm_comp <- readRDS(paste0("../data/ps_no_norm_age_bf_filt_complete_family.rds"))

#remove samples that are too young and their paired sibling
#tooyoung <-ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Age..months. < 24)]

#family 65 should be removed according to >24 months criteria
#ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != unique(tooyoung), ps_not_norm_comp)

#List of individuals that were reported w/ autism, but was not classified as such through MARA and/or video classifier
#pheno_contrad <-read.csv("../phenotype_contradictions.csv")
#contradicting<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% as.character(pheno_contrad$host_name))])

#remove the whole family
#for (i in contradicting){
 # ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)

#}


#Remove Breastfed and their families
breast_fed <- c("089_A","054_N", "158_N" )

b_fed<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% breast_fed)])

#remove the whole family
for (i in b_fed){
  ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)

}

#Remove Possible Contradictions and their families
possible_phen_contra <- c("020_A","131_A", "184_A" )

p_con<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% possible_phen_contra)])

#remove the whole family
for (i in p_con){
  ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)

}

CCA and visualization

Compare resulting amplicon data between and within sample types by canonical correlation analysis, regression profiling, and visualization (e.g. non-metric multi-dimensional scaling [NMDS], principle coordinates of analysis, principle component analysis).

Constrained by status A or N

plotting_consPcoA <- function(ps, norm, variables){
  #drop samples with NA values in contraints
  keep <- as.vector(apply(ps@sam_data[, variables], 1, function(x) return(!any(is.na(x)))))
  ps <- prune_samples(keep, ps)
  
  #Ease of labeling 
  colnames(ps@sam_data)[colnames(ps@sam_data) == variables] <- gsub("frequency...longitudinal.", "", variables)
  variables <- gsub("frequency...longitudinal.", "", variables)
  colnames(ps@sam_data)[colnames(ps@sam_data) == variables] <- gsub("frequency.", "", variables)
  variables <- gsub("frequency.", "", variables)
  
  #include only families that have all 6 timepoints, 3 for ASD and 3 for NT participants, for ease of visualization
  fam_6<-names(table(sample_data(ps)$Family.group.ID)[table(sample_data(ps)$Family.group.ID) == 6])
  ps_6fam<-prune_samples(sample_data(ps)$Family.group.ID %in% fam_6,ps )
  sample_data(ps_6fam)$Family.group.ID <- paste0('fam', as.character(sample_data(ps_6fam)$Family.group.ID))
  formula <- as.formula(paste("~", variables))
  if(length(variables) > 1){
    formula <- as.formula(paste("~", paste(variables, collapse = "+")))
  }

  ps_pcoa_ord <- ordinate(
    physeq = ps_6fam, 
    method = "CAP", 
    distance = "bray",
    formula = formula
    )
  p <- plot_ordination(
    physeq = ps_6fam, 
    ordination = ps_pcoa_ord, 
    color = variables, 
    axes = c(1,2),
    title= paste("Constrained PcoA",norm,"ordinated by \n", paste(variables, collapse = "\n"))) + 
    geom_point( size = 2) +
    theme_minimal()+
    theme(text = element_text(size =10), plot.title = element_text(size=10))
  if(variables == "phenotype"){
    p <- p + scale_color_manual(values=sgColorPalette)
  }
  
  #sum_pcoA_DesEq<-summary(ps_pcoa_ord)
  erie_bray_sum_pcoA <- phyloseq::distance(ps, method = "bray")
  sampledf <- data.frame(sample_data(ps))
  beta_di<-betadisper(erie_bray_sum_pcoA, ps@sam_data$Family.group.ID)
  to_return<-list()
  to_return[[1]]<-p
  to_return[[2]]<-beta_di
  return(to_return)
}

Constrained by phenotype

#With Deseq
DeSeq_distance <- plotting_consPcoA(ps_deseq, "Deseq", variables = "phenotype")
# plot
DeSeq_distance[[1]]

#same with CSS 
CSS_distance <- plotting_consPcoA(ps_css, "CSS", variables = "phenotype")
# plot
CSS_distance[[1]]

Constrained by family

#With Deseq
DeSeq_distance <- plotting_consPcoA(ps_deseq, "Deseq", variables = "Family.group.ID")
# plot
DeSeq_distance[[1]]

#same with CSS 
CSS_distance <- plotting_consPcoA(ps_css, "CSS", variables = "Family.group.ID")
# plot
CSS_distance[[1]]

#the distance in those plot?
#average_distance_to_median
#pdf(file=paste0(output_data, "Figures/Distance_DeSeq_CSS_", Sys.Date(), ".pdf"))
boxplot(DeSeq_distance[[2]]$distances,CSS_distance[[2]]$distances, names=c("DeSeq", "CSS"), 
        xlab = "Type of Normalization", ylab = "Distance on Component 1 & 2", main ="Intragroup distance for each family")

#dev.off()

Constrained by top permanova variables

permanova_res <- read.csv(paste0(output_data, "PERMANOVA_noLR.csv"))
permanova_res <- permanova_res[2:nrow(permanova_res), ] #First only include 95 samples because optional question
r2_min <- .02
n_sample_min <- 400
variables <- as.character(permanova_res[permanova_res$TotalN > n_sample_min & permanova_res$R2 > r2_min, ]$Variable)
plot_list <- list()
for(variable in variables){
  res <- plotting_consPcoA(ps_deseq, "Deseq", variables = variable)
  plot_list[[variable]] <- res[[1]]
}
plot_grid(plotlist = plot_list, nrow = 2, ncol = 2)

pdf(paste0(output_data, 'pcoa_top_permanova_variables.pdf'), width = 12, height = 9)
plot_grid(plotlist = plot_list, nrow = 2, ncol = 2)
dev.off()
## png 
##   2

Diversity

Characterize and assess the diversity of each sample, and evaluate the extent of dissimilarity between the cohorts

Alpha diversity

ER <- estimate_richness(ps_not_norm_comp, measures=c("Observed", "Chao1", "Shannon"))
ER <- cbind(ER, sample_data(ps_not_norm_comp)[row.names(ER), c("phenotype", "Family.group.ID", "Within.study.sampling.date")])
ER <- data.table(ER, keep.rownames = TRUE)
ER_long <- melt(ER, id.vars=c('rn', 'phenotype', "Family.group.ID", "Within.study.sampling.date"))

# plot
ggplot(data=ER_long[variable!='se.chao1'], aes(x=phenotype, y=value, fill=phenotype))+
  geom_boxplot(width=0.7, outlier.colour='white')+
  geom_jitter(size=1, position=position_jitter(width=0.1))+
  xlab('')+ylab('')+
  scale_fill_manual(values=sgColorPalette)+
  theme_minimal()+facet_wrap(~variable, scales='free')

# run t-test to check significance
ttest=NULL
for(alphad in c('Observed', 'Chao1', 'Shannon')){
  ttest_res=t.test(value ~ phenotype, data=ER_long[ER_long$variable==alphad], var.equal=TRUE)
  ttest=rbindlist(list(ttest, data.table(alpha_index=alphad, pvalue=ttest_res$p.value)))
}

pander(ttest)
alpha_index pvalue
Observed 0.2555
Chao1 0.2555
Shannon 0.7071

Beta diversity

#Let's do a PcoA #not much differences 
GP.ord <- ordinate(ps_deseq, "PCoA", "bray")
p2 = plot_ordination(ps_deseq, GP.ord, type="samples", color="phenotype") +
  geom_point( size = 1)+
  scale_color_manual(values=sgColorPalette)+
  theme_minimal()
p2

# function to plot PCoA without NA points
wo_na_pcoa <- function(ps, pvar, ord_res){
  # ord_res: ordinated object
  
  keepid=!is.na(get_variable(ps, pvar)) &
    get_variable(ps, pvar)!='NA' &
    get_variable(ps, pvar)!=''
  tmp_ps <- prune_samples(keepid, ps)
  
  # get subset counts and metadata together
  DF <- cbind(ord_res$vectors[row.names(sample_data(tmp_ps)), 1:2], sample_data(tmp_ps)[, pvar])
  setnames(DF, pvar, 'testvar')
  
  # get eigenvalues
  eig=(ord_res$values$Eigenvalues/sum(ord_res$values$Eigenvalues))[1:2]*100
  
  p <- ggplot(data=DF, aes(x=Axis.1, y=Axis.2, colour=testvar))+
    geom_point(size=2)+
    ggtitle(pvar)+
    xlab(paste0('Axis.1 [', format(eig[1], digits=3), '%]'))+
    ylab(paste0('Axis.2 [', format(eig[2], digits=3), '%]'))+
    theme_minimal()+
    theme(legend.title=element_blank(), legend.position="bottom")
    
  print(p)
}
ord <- ordinate(ps_css, method = "PCoA", distance = "bray")
wo_na_pcoa(ps_css, 'Mobile.Autism.Risk.Assessment.Score', ord)

Probiotics

wo_na_pcoa(ps_deseq, 'Probiotic..consumption.frequency.', GP.ord)

Antibiotics

# Anti.infective
wo_na_pcoa(ps_deseq, 'Anti.infective', GP.ord)

# Minimum.time.since.antibiotics
sample_data(ps_deseq)$Minimum.time.since.antibiotics <- as.numeric(sample_data(ps_deseq)$Minimum.time.since.antibiotics)
wo_na_pcoa(ps_deseq, 'Minimum.time.since.antibiotics', GP.ord)

All other passed R2 cutoff and significant

permanova_pcut = 0.05
permanova_cut = 0.02
permanova_res <- read.csv(paste0(output_data, "PERMANOVA_noLR.csv"))
for(pvar in permanova_res[permanova_res$R2>permanova_cut & permanova_res$pvalue<permanova_pcut]$Variable){
  wo_na_pcoa(ps_deseq, pvar, GP.ord)
}